# Description of features:

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# show up to 100 rows / columns when a frame is printed
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, 100)
# custom plot settings
size = 20
# tick labels and legend titles use 75% of the base font size
small_size = size * 0.75
params = {
    # legend
    'legend.fontsize': 'large',
    'legend.title_fontsize': small_size,
    'legend.frameon': False,
    # figure / axes
    'figure.figsize': (8, 4),
    'axes.labelsize': size,
    'axes.titlesize': size,
    # ticks
    'xtick.labelsize': small_size,
    'ytick.labelsize': small_size,
    # spacing and font
    'axes.titlepad': 10,
    'figure.subplot.hspace': 0.9,
    'font.sans-serif': 'Arial',
}
# matplotlib.rcParams.keys() # to see editable options
plt.rcParams.update(params)
# import data
df = pd.read_excel('../data/gen/collapsed_data.xlsx')
# create amplitude ratio feature
# NOTE(review): the ratio uses amplitude_3, while later plots label amplitude_2
# as the "second peak" - confirm which column is intended.
df['amp_ratio'] = df['amplitude_1'] / df['amplitude_3']
# create 'area under curve' features (amplitude x duration; not a true area,
# but correlated with it)
df['auc_1'] = df['amplitude_1'] * df['duration_2']
df['auc_2'] = df['amplitude_3'] * (df['duration_3'] - df['duration_2'])
# add gender to df: male_list is indexed by person_id - 1
male_list = [True, True, True, False, True, True, False, True, True, False]
df['male'] = df['person_id'].apply(lambda pid: male_list[pid - 1])
pd.crosstab(df['person_id'], df['male'])
# check df: missing values per column, summary statistics, first rows
missing_per_column = df.isnull().sum()
print(missing_per_column)
display(df.describe())
df.head()
# Mean of each feature by swallow volume
dfmean = df.groupby('swallow_volume').agg('mean').reset_index()
# One scatter figure per feature column: group mean against swallow volume.
# Identifier / label columns are skipped.
for x in [v for v in dfmean.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    plt.scatter(dfmean.swallow_volume, dfmean[x])
    plt.title(x)
    plt.show()
# Box plot of every feature column, grouped by swallow volume.
# `display` is not imported in this file - presumably the IPython/Jupyter
# built-in; confirm before running as a plain script.
for x in [v for v in df.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    display(df.boxplot(column=[x], by=['swallow_volume']))
# Box plot of every feature column, grouped by the `male` flag.
# NOTE(review): the file's indentation was lost; the title/show calls are
# assumed to belong to the loop body (one cleared title and one show per
# figure) - confirm against the original notebook.
for x in [v for v in df.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    display(df.boxplot(column=[x], by=['male']))
    plt.title('')
    plt.show()
# demean function
def demean(df, var=None, exclude=None):
    """Standardise the feature columns of ``df`` within groups.

    Each feature column has its group mean subtracted and is then divided by
    its group standard deviation (pandas' default sample std, ``ddof=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    var : str or list of str, optional
        Column(s) that define the groups. Defaults to ``['person_id']``.
    exclude : iterable of str, optional
        Columns that are passed through unchanged. Defaults to the
        identifier / label columns ``swallow_volume``, ``observation``,
        ``person_id``, ``swallow_id`` and ``male``. The grouping columns are
        always passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order and a fresh
        ``RangeIndex``. Rows whose grouping key is missing belong to no group
        and are dropped. Groups with a single row or zero variance yield
        NaN / inf, because the group std is NaN / 0.
    """
    # Normalise the grouping spec without relying on a mutable default.
    if var is None:
        keys = ['person_id']
    elif isinstance(var, str):
        keys = [var]
    else:
        keys = list(var)
    if exclude is None:
        exclude = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
    skip = set(exclude) | set(keys)
    features = [col for col in df.columns if col not in skip]

    # Work on a copy so the caller's frame is untouched; drop rows without a
    # group key and renumber the index.
    out = df.dropna(subset=keys).reset_index(drop=True)
    if not features:
        return out

    # transform() broadcasts the per-group statistic back to every row, so no
    # per-column merge is needed.
    grouped = out.groupby(keys)[features]
    out[features] = (out[features] - grouped.transform('mean')) / grouped.transform('std')
    return out
# standardise every feature within each person
dfdemean = demean(df, var=['person_id'])
dfdemean.head()
# plot demeaned boxplots: one figure per feature, grouped by swallow volume
skip_cols = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
for x in dfdemean.columns:
    if x in skip_cols:
        continue
    dfdemean.boxplot(column=[x], by=['swallow_volume'])
import seaborn as sns
# get subset: rows with person_id <= 5 or person_id == 7
keep_rows = (dfdemean.person_id <= 5) | (dfdemean.person_id == 7)
dfsub = dfdemean.loc[keep_rows, :]
skip_cols = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
# one figure per feature: per-person points plus the mean +/- sd per volume
for y in dfsub.columns:
    if y in skip_cols:
        continue
    print(y)
    # Plot (the seaborn theme is re-applied on every pass)
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig, ax1 = plt.subplots(figsize=(8, 5))
    # individual observations, dodged and jittered by person
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', data=dfsub,
                  dodge=True, jitter=True, alpha=.40, zorder=1, size=8,
                  ax=ax1)
    # mean with standard-deviation bars, drawn on top in black
    sns.pointplot(x='swallow_volume', y=y, ci='sd', data=dfsub,
                  join=False, scale=1.5, zorder=100, color='black',
                  ax=ax1)
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5),
               loc='center left', fontsize='xx-small')
# subplot with amp1, amp2, ampr, auc1
# get data subset: person 4, swallow volumes up to and including 15
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume <= 15)]
# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))
# (grid position, feature column, panel title, y-axis label)
# all units are 'normalized' to be mean zero, stddev = 1
# (alternative labels once considered: 'Normalized Amplitude', '% Size',
#  'Normalized Area')
panels = [
    ((0, 0), 'amplitude_1', 'Amplitude of First Peak', 'Normalized Units'),
    ((0, 1), 'amplitude_2', 'Amplitude of Second Peak', ''),
    ((1, 0), 'amp_ratio', 'Ratio of Peak Amplitudes', 'Normalized Units'),
    ((1, 1), 'auc_1', 'Area Under Curve, First Peak', ''),
]
for (row, col), y, panel_title, panel_ylabel in panels:
    panel = ax[row, col]
    # raw points, then the mean +/- sd on top in black
    sns.stripplot(ax=panel, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1)
    sns.pointplot(ax=panel, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False,
                  zorder=100, color='black')
    panel.title.set_text(panel_title)
    panel.set_ylabel(panel_ylabel)
    # only the bottom row carries an x-axis label
    panel.set_xlabel('' if row == 0 else 'Swallow Volume (ml)')
    panel.get_legend().remove()
fig.tight_layout()
# subplot with amp1, amp2, ampr, auc1
# get data subset: person 4, swallow volumes strictly above 0
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume > 0)]
# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))
# (grid position, feature column, panel title, y-axis label)
# all units are 'normalized' to be mean zero, stddev = 1
# (alternative labels once considered: 'Normalized Amplitude', '% Size',
#  'Normalized Area')
panels = [
    ((0, 0), 'amplitude_1', 'Amplitude of First Peak', 'Normalized Units'),
    ((0, 1), 'amplitude_2', 'Amplitude of Second Peak', ''),
    ((1, 0), 'amp_ratio', 'Ratio of Peak Amplitudes', 'Normalized Units'),
    ((1, 1), 'auc_1', 'Area Under Curve, First Peak', ''),
]
for (row, col), y, panel_title, panel_ylabel in panels:
    panel = ax[row, col]
    # raw points, then the mean +/- sd on top in black
    sns.stripplot(ax=panel, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1)
    sns.pointplot(ax=panel, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False,
                  zorder=100, color='black')
    panel.title.set_text(panel_title)
    panel.set_ylabel(panel_ylabel)
    # only the bottom row carries an x-axis label
    panel.set_xlabel('' if row == 0 else 'Swallow Volume (ml)')
    panel.get_legend().remove()
fig.tight_layout()
# get subset: rows with person_id <= 7, excluding person_id 6
keep_rows = (dfdemean.person_id <= 7) & (dfdemean.person_id != 6)
dfsub = dfdemean.loc[keep_rows, :]
skip_cols = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
# one figure per feature: per-person points plus the mean +/- sd per volume
for y in dfsub.columns:
    if y in skip_cols:
        continue
    print(y)
    # Plot (the seaborn theme is re-applied on every pass)
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig, ax1 = plt.subplots(figsize=(8, 5))
    # individual observations, dodged and jittered by person
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', data=dfsub,
                  dodge=True, jitter=True, alpha=.40, zorder=1, size=8,
                  ax=ax1)
    # mean with standard-deviation bars, drawn on top in black
    sns.pointplot(x='swallow_volume', y=y, ci='sd', data=dfsub,
                  join=False, scale=1.5, zorder=100, color='black',
                  ax=ax1)
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5),
               loc='center left', fontsize='xx-small')
# get subset: rows with person_id >= 8
keep_rows = dfdemean.person_id >= 8
dfsub = dfdemean.loc[keep_rows, :]
skip_cols = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
# one figure per feature: per-person points plus the mean +/- sd per volume
for y in dfsub.columns:
    if y in skip_cols:
        continue
    print(y)
    # Plot (the seaborn theme is re-applied on every pass)
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig, ax1 = plt.subplots(figsize=(8, 5))
    # individual observations, dodged and jittered by person
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', data=dfsub,
                  dodge=True, jitter=True, alpha=.40, zorder=1, size=8,
                  ax=ax1)
    # mean with standard-deviation bars, drawn on top in black
    sns.pointplot(x='swallow_volume', y=y, ci='sd', data=dfsub,
                  join=False, scale=1.5, zorder=100, color='black',
                  ax=ax1)
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5),
               loc='center left', fontsize='xx-small')
# get subset: raw (not demeaned) rows with person_id >= 0
keep_rows = df.person_id >= 0
dfsub = df.loc[keep_rows, :]
skip_cols = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
# one figure per feature: per-person points plus the mean +/- sd per gender
for y in dfsub.columns:
    if y in skip_cols:
        continue
    print(y)
    # Plot (the seaborn theme is re-applied on every pass)
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig, ax1 = plt.subplots(figsize=(8, 5))
    # individual observations, dodged and jittered by person
    sns.stripplot(x='male', y=y, hue='person_id', data=dfsub,
                  dodge=True, jitter=True, alpha=.40, zorder=1, size=8,
                  ax=ax1)
    # mean with standard-deviation bars, drawn on top in black
    sns.pointplot(x='male', y=y, ci='sd', data=dfsub,
                  join=False, scale=1.5, zorder=100, color='black',
                  ax=ax1)
    ax1.set_xlabel('Male (Boolean)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5),
               loc='center left', fontsize='xx-small')
# fit T-SNE and plot
from sklearn.manifold import TSNE
# get subset of data: rows with person_id >= 0, without the bookkeeping
# columns 'observation', 'swallow_id' and 'emg'
tsne_cols = [c for c in df.columns if c not in ['observation', 'swallow_id', 'emg']]
# .copy() makes dfsub an independent frame: its columns are overwritten
# further down, and assigning into a slice of df triggers pandas'
# chained-assignment warning.
dfsub = df.loc[df.person_id >= 0, tsne_cols].copy()
# rescale data
def rescale(vec):
    """Return ``vec`` minus its mean, divided by its standard deviation.

    Mean and standard deviation come from ``vec.mean()`` and ``vec.std()``,
    so the degrees-of-freedom convention is that of the container passed in.
    """
    centered = vec - vec.mean()
    spread = vec.std()
    return centered / spread
# Standardise every column except the id / target columns within each person.
# transform() returns a result aligned to dfsub's own index. groupby.apply()
# on a same-length result prepends the group key to the index in pandas 2.x,
# so assigning it back to a column misaligns.
for v in [c for c in dfsub.columns if c not in ['person_id', 'swallow_volume']]:
    dfsub[v] = dfsub.groupby(['person_id'])[v].transform(rescale)
# define X and Y vecs
# NOTE(review): X is selected by position (third column up to, but excluding,
# the last one) - this assumes the first two columns are not features and the
# last column is 'male'; confirm against the spreadsheet layout.
X = dfsub.iloc[:, 2:-1]
Y = dfsub.loc[:, 'swallow_volume']
# fit model
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# adjust the keyword if the installed version rejects it.
X_embedded = TSNE(n_components = 2,
                  perplexity = 50,
                  learning_rate = 100,
                  n_iter = 5000).fit_transform(X, Y)
print(X_embedded.shape)
# merge results: put the two embedding coordinates next to the (re-indexed)
# input rows. `axis` must be passed by keyword in pandas 2.x.
dfm = pd.concat([dfsub.reset_index(drop=True),
                 pd.DataFrame(X_embedded, columns=['tsne0', 'tsne1'])], axis=1)
dfm.head()
# plot t-sne, color by person
fig, ax = plt.subplots()
scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.person_id, cmap=plt.cm.jet)
handles, labels = scatter.legend_elements(num=10)
ax.legend(handles, labels, ncol=2,
          loc="center left", bbox_to_anchor=(1, 0.5), title="person id")
ax.set_title('Colored by person id')
plt.show()
# plot t-sne, color by swallow volume
# color map: 10 samples of 'Blues', dropping the two lightest shades and the
# alpha channel
blues = matplotlib.cm.Blues(np.linspace(0,1,10))
cmap = matplotlib.colors.ListedColormap(blues[2:,:-1])
fig, ax = plt.subplots()
scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.swallow_volume, cmap=cmap)
handles, labels = scatter.legend_elements(num=6)
ax.legend(handles, labels,
          loc="center left", bbox_to_anchor=(1, 0.5), title="swallow volume")
ax.set_title('Colored by swallow volume')
plt.show()
# plot t-sne, color by gender
# dfm holds the rows of df with person_id >= 0 (re-indexed). Its own 'male'
# column went through the per-person rescaling, so the flags are taken from
# df instead - restricted to the same rows, so the colour vector always lines
# up with the plotted points.
male_flags = df.loc[df.person_id >= 0, 'male'].to_numpy()
fig, ax = plt.subplots()
# `cmap` is the blue colour map built for the swallow-volume plot
scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=male_flags, cmap=cmap)
ax.legend(*scatter.legend_elements(num=2),
          loc="center left", bbox_to_anchor=(1, 0.5), title="Male (Boolean)")
plt.title('Colored by gender')
plt.show()